This note project folder demonstrate the basics of spatial data analysis with R.
library(tidyverse)
library(ggthemes)
theme_set(theme_map())
library(sf)
sf_use_s2(FALSE)
library(rnaturalearth)
library(rnaturalearthdata)
Introducing a new format of geo-referenced data.
world = ne_countries(scale = "medium", type = "map_units", returnclass = "sf")
# world_2 = ne_countries(scale = "medium", type = "map_units", returnclass = "sf")
# View(world_2 |> select(geometry))
world |> ggplot() + geom_sf()
names(world)
## [1] "scalerank" "featurecla" "labelrank" "sovereignt" "sov_a3"
## [6] "adm0_dif" "level" "type" "admin" "adm0_a3"
## [11] "geou_dif" "geounit" "gu_a3" "su_dif" "subunit"
## [16] "su_a3" "brk_diff" "name" "name_long" "brk_a3"
## [21] "brk_name" "brk_group" "abbrev" "postal" "formal_en"
## [26] "formal_fr" "note_adm0" "note_brk" "name_sort" "name_alt"
## [31] "mapcolor7" "mapcolor8" "mapcolor9" "mapcolor13" "pop_est"
## [36] "gdp_md_est" "pop_year" "lastcensus" "gdp_year" "economy"
## [41] "income_grp" "wikipedia" "fips_10" "iso_a2" "iso_a3"
## [46] "iso_n3" "un_a3" "wb_a2" "wb_a3" "woe_id"
## [51] "adm0_a3_is" "adm0_a3_us" "adm0_a3_un" "adm0_a3_wb" "continent"
## [56] "region_un" "subregion" "region_wb" "name_len" "long_len"
## [61] "abbrev_len" "tiny" "homepart" "geometry"
d = read_csv("Lec_09/data/GEDEvent_v22_1.csv")
d_event_2021 = d |> filter(year == 2021)
ggplot() +
geom_sf(data = world) +
geom_point(data = d_event_2021,
aes(x = longitude, y = latitude),
alpha = 0.2)
Look at only conflicts in Africa.
ggplot() +
geom_sf(data = world |> filter(region_un == "Africa")) +
geom_point(data = d_event_2021 |> filter(region == "Africa"),
aes(x = longitude, y = latitude),
alpha = 0.2)
d_country_2021 = d |>
filter(year == 2021) |>
group_by(country_id, country, region) |>
summarise(
n_conflict = n()
)
d_country_2021 = d_country_2021 |> arrange(-n_conflict)
world_m = world |>
left_join(d_country_2021, by = c("sovereignt" = "country"))
world_m |>
select(sovereignt, n_conflict)
## Simple feature collection with 264 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -180 ymin: -89.99893 xmax: 180 ymax: 83.59961
## CRS: +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
## First 10 features:
## sovereignt n_conflict geometry
## 1 Netherlands NA MULTIPOLYGON (((-69.89912 1...
## 2 Antigua and Barbuda NA MULTIPOLYGON (((-61.71606 1...
## 3 Antigua and Barbuda NA MULTIPOLYGON (((-61.74712 1...
## 4 Afghanistan 4510 MULTIPOLYGON (((74.89131 37...
## 5 Angola 4 MULTIPOLYGON (((14.19082 -5...
## 6 United Kingdom NA MULTIPOLYGON (((-63.00122 1...
## 7 Albania NA MULTIPOLYGON (((20.06396 42...
## 8 Finland NA MULTIPOLYGON (((20.61133 60...
## 9 Andorra NA MULTIPOLYGON (((1.706055 42...
## 10 United Arab Emirates NA MULTIPOLYGON (((53.92783 24...
summary(world_m$n_conflict)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.0 4.0 42.0 257.4 172.2 4510.0 208
ggplot() +
geom_sf(data = world_m, aes(fill = n_conflict)) +
scale_fill_viridis_c(option = "B", direction = -1, trans = "log")
Reproject the map to change the sizes regions according to a variable of interest.
Read this Wikipedia page for a detailed introduction of Cartogram: https://en.wikipedia.org/wiki/Cartogram
Case: Draw Cartograms of conflicts in Africa
Personally, this is the most interesting type of map.
# install.packages("cartogram")
library(cartogram)
# Cartogram by the number of conflict events in Africa
world_m_africa = world_m |>
filter(region == "Africa") |>
mutate(geometry = st_transform(geometry, 3857)) # Specify a projection of the map. Essential
# NEW: Fill in NA n_conflict
summary(world_m_africa$n_conflict)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 5.5 52.0 122.7 193.0 627.0
# Re-project the map using the cartogram package
world_m_africa_cart = world_m_africa |>
cartogram_cont(weight = "n_conflict")
# Plot a choropleth map
ggplot(data = world_m_africa) +
geom_sf(aes(fill = n_conflict)) +
geom_sf_label(aes(label = sovereignt)) +
scale_fill_viridis_c(option = "B", direction = -1, trans = "log")
# Plot a cartogram
ggplot(data = world_m_africa_cart) +
geom_sf() +
geom_sf_label(aes(label = sovereignt))
# Diagrammic (Dorling) catograms -- some further extraction
world_m_africa_cart_dorling = world_m_africa |>
cartogram_dorling(weight = "n_conflict")
ggplot(data = world_m_africa_cart_dorling) +
geom_sf() +
geom_sf_text(aes(label = sovereignt))
In cross-country analysis, what brings the most trouble is the matching of country identifiers. One thing that goes unnoticed in the above steps is that some contries “fell out” of the study siliently. We have a handy tool to help match countries from different datasets correctly.
# Identify the problem. Which countries fell out of the study because their identifies do not match?
# Use the anti_join function
d_country_2021 |>
anti_join(world, c("country" = "sovereignt"))
## # A tibble: 6 × 4
## # Groups: country_id, country [6]
## country_id country region n_conflict
## <dbl> <chr> <chr> <int>
## 1 775 Myanmar (Burma) Asia 848
## 2 490 DR Congo (Zaire) Africa 781
## 3 678 Yemen (North Yemen) Middle East 764
## 4 572 Kingdom of eSwatini (Swaziland) Africa 10
## 5 510 Tanzania Africa 8
## 6 365 Russia (Soviet Union) Europe 2
# Keep the entries that are NOT matched
# install.packages("countrycode")
library(countrycode)
d_country_2021_t = d_country_2021 |>
mutate(
iso3c = countrycode(country, "country.name", "iso3c")
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `iso3c = countrycode(country, "country.name", "iso3c")`.
## ℹ In group 48: `country_id = 678`, `country = "Yemen (North Yemen)"`.
## Caused by warning in `countrycode_convert()`:
## ! Some values were not matched unambiguously: Yemen (North Yemen)
# Check: There is a warning --> Some country names fail to be parsed.
d_country_2021_t |>
select(country, iso3c) |>
filter(is.na(iso3c))
## # A tibble: 1 × 3
## # Groups: country_id, country [1]
## country_id country iso3c
## <dbl> <chr> <chr>
## 1 678 Yemen (North Yemen) <NA>
# Remove the (North Yemen) part and redo it
d_country_2021_t = d_country_2021 |>
mutate(
country = recode(country, "Yemen (North Yemen)" = "Yemen")
) |>
mutate(
iso3c = countrycode(country, "country.name", "iso3c")
)
# Re-do the steps that join the world map and the data. This time, using ISO3C as the country identifier.
world_m = world |>
left_join(d_country_2021_t, by = c("iso_a3" = "iso3c"))
# Join continent's names using countrycode
world_m = world_m |>
mutate(
continent = countrycode(iso_a3, "iso3c", "continent")
)
## Warning: There was 1 warning in `stopifnot()`.
## ℹ In argument: `continent = countrycode(iso_a3, "iso3c", "continent")`.
## Caused by warning in `countrycode_convert()`:
## ! Some values were not matched unambiguously: ATA, ATF, CCK, HMD, IOT, SGS
# Cartogram by the number of conflict events in Africa
world_m_africa = world_m |>
filter(continent == "Africa") |>
mutate(geometry = st_transform(geometry, 3857))
summary(world_m_africa$n_conflict)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 6.75 52.00 135.96 193.00 781.00 30
# # Impute ZEROS
# world_m_africa = world_m_africa |>
# mutate(n_conflict = replace_na(n_conflict, 1))
# summary(world_m_africa$n_conflict)
# Re-project the map using the cartogram package
world_m_africa_cart = world_m_africa |>
cartogram_cont(weight = "n_conflict")
## Warning in cartogram_cont.sf(world_m_africa, weight = "n_conflict"): NA not
## allowed in weight vector. Features will be removed from Shape.
# Plot a choropleth map
ggplot(data = world_m_africa) +
geom_sf(aes(fill = n_conflict)) +
geom_sf_label(aes(label = sovereignt)) +
scale_fill_viridis_c(option = "B", direction = -1, trans = "log")
# Plot a cartogram
ggplot(data = world_m_africa_cart) +
geom_sf() +
geom_sf_label(aes(label = iso_a3))
# Diagrammic (Dorling) catograms
world_m_africa_cart_dorling = world_m_africa |>
filter(!is.na(n_conflict)) |>
# Note: The cartogram_dorling function will report error if NA values exist.
cartogram_dorling(weight = "n_conflict")
ggplot(data = world_m_africa_cart_dorling) +
geom_sf() +
geom_sf_text(aes(label = sovereignt))